MAS202: Final project on housing prices in Iceland 2008-2018

Part 1: Clean the data

# Load the tidyverse and pull the 2018 Icelandic property-sales dataset from
# Registers Iceland. The raw file is semicolon-separated, Latin-1 encoded,
# and marks missing values with the literal string "(null)".
library(tidyverse)
Data <- read.csv(
  "https://www.skra.is/library/Samnyttar-skrar-/Fyrirtaeki-stofnanir/Fasteignamat-2019/gagnasafn_ib_2018.csv",
  sep = ";", fileEncoding = "latin1", na.strings = "(null)"
)
dim(Data)
## [1] 49777    53
# Drop every row containing any missing value.
# NOTE(review): this is applied across all 53 columns, so rows are discarded
# for NAs in columns that are dropped later anyway — subsetting to C1 first
# would retain more observations; confirm this was intended.
Data <- na.omit(Data)

We can see that the dataset has 49777 observations and 53 variables. We use the variables which are suggested.
netto area is similar to area of property so, we discarded it along with circumference. Finally, we have all the information we considered to be sufficient to progress our part 1 cleaning data.
ibteg and teg_eign carry the same information, so we keep only teg_eign.
Property id is irrelevant. So, should be removed.
storage rooms is also removed, since we have storage room area variable.

# Variables retained for the analysis: sale date, price, property type,
# municipality code, build year, floor, number of units, lift, size,
# floors, garages, baths/showers/toilets/kitchens, rooms, living rooms,
# condition rating, garage/balcony/storage areas and valuation-zone codes.
C1 <- c(
  "kdagur", "nuvirdi", "teg_eign", "svfn", "byggar", "efstah", "fjmib",
  "lyfta", "ibm2", "fjhaed", "fjbilsk", "fjbkar", "fjsturt", "fjklos",
  "fjeld", "fjherb", "fjstof", "stig10", "bilskurm2", "svalm2", "geymm2",
  "matssvaedi", "undirmatssvaedi"
)

We set the condition that our data contains only the capital area, svfn<1606, with only residential housing — excluding hotels, guest-houses, offices, and illegal apartments.

Since Fjölbýlishús, Íbúðareign and Íbúðarhús introduce the same object. So, we named Íbúðareign, apartment, for all of the three variables.

# The raw file uses commas as decimal separators; convert the size columns
# to numeric with dots. ibm2 is converted on Data before subsetting.
Data$ibm2 <- as.numeric(gsub(",", ".", Data$ibm2))

# Keep only the selected variables.
Datasub <- subset(Data, select = C1)
# Same comma-to-dot conversion for the remaining area columns.
for (area_col in c("svalm2", "geymm2", "bilskurm2")) {
  Datasub[[area_col]] <- as.numeric(gsub(",", ".", Datasub[[area_col]]))
}
# Condition rating is a whole number.
Datasub$stig10 <- as.integer(Datasub$stig10)
# Drop rows with missing values introduced by the coercions above.
Datasub <- na.omit(Datasub)
summary(Datasub)
##     kdagur             nuvirdi         teg_eign              svfn     
##  Length:43601       Min.   :   212   Length:43601       Min.   :   0  
##  Class :character   1st Qu.: 22051   Class :character   1st Qu.:   0  
##  Mode  :character   Median : 29774   Mode  :character   Median :1000  
##                     Mean   : 33165                      Mean   :1919  
##                     3rd Qu.: 40444                      3rd Qu.:2000  
##                     Max.   :956146                      Max.   :8722  
##      byggar         efstah           fjmib            lyfta       
##  Min.   :1787   Min.   : 0.000   Min.   : 1.000   Min.   :0.0000  
##  1st Qu.:1963   1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.:0.0000  
##  Median :1981   Median : 3.000   Median : 1.000   Median :0.0000  
##  Mean   :1980   Mean   : 2.961   Mean   : 1.007   Mean   :0.3474  
##  3rd Qu.:2003   3rd Qu.: 4.000   3rd Qu.: 1.000   3rd Qu.:0.0000  
##  Max.   :2018   Max.   :13.000   Max.   :37.000   Max.   :8.0000  
##       ibm2            fjhaed         fjbilsk           fjbkar       
##  Min.   :  17.7   Min.   :1.000   Min.   :0.0000   Min.   : 0.0000  
##  1st Qu.:  78.2   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 1.0000  
##  Median : 100.0   Median :1.000   Median :0.0000   Median : 1.0000  
##  Mean   : 108.3   Mean   :1.211   Mean   :0.1593   Mean   : 0.7643  
##  3rd Qu.: 127.4   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.: 1.0000  
##  Max.   :1510.3   Max.   :4.000   Max.   :8.0000   Max.   :37.0000  
##     fjsturt           fjklos           fjeld           fjherb      
##  Min.   :0.0000   Min.   : 0.000   Min.   :0.000   Min.   : 0.000  
##  1st Qu.:0.0000   1st Qu.: 1.000   1st Qu.:1.000   1st Qu.: 2.000  
##  Median :1.0000   Median : 1.000   Median :1.000   Median : 2.000  
##  Mean   :0.5532   Mean   : 1.205   Mean   :1.006   Mean   : 2.641  
##  3rd Qu.:1.0000   3rd Qu.: 1.000   3rd Qu.:1.000   3rd Qu.: 3.000  
##  Max.   :6.0000   Max.   :37.000   Max.   :8.000   Max.   :55.000  
##      fjstof           stig10         bilskurm2          svalm2       
##  Min.   : 0.000   Min.   : 4.000   Min.   :-70.00   Min.   : -0.200  
##  1st Qu.: 1.000   1st Qu.:10.000   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median : 1.000   Median :10.000   Median :  0.00   Median :  4.200  
##  Mean   : 1.212   Mean   : 9.979   Mean   : 10.31   Mean   :  6.257  
##  3rd Qu.: 1.000   3rd Qu.:10.000   3rd Qu.: 24.40   3rd Qu.:  8.900  
##  Max.   :14.000   Max.   :10.000   Max.   :134.60   Max.   :269.800  
##      geymm2          matssvaedi   undirmatssvaedi  
##  Min.   :  0.000   Min.   :  11   Min.   :  0.000  
##  1st Qu.:  0.000   1st Qu.: 140   1st Qu.:  0.000  
##  Median :  1.300   Median : 351   Median :  0.000  
##  Mean   :  5.114   Mean   :1693   Mean   :  4.321  
##  3rd Qu.:  7.300   3rd Qu.:2050   3rd Qu.:  0.000  
##  Max.   :175.000   Max.   :8200   Max.   :507.000
library(psych)

# teg_eign is still a plain character column at this point.
summary(Datasub$teg_eign)
##    Length     Class      Mode 
##     43601 character character
# Merge the three equivalent residential labels into one level and collect
# the non-residential / non-approved categories into "notimport".
Datasub$teg_eign=fct_collapse(Datasub$teg_eign, 
                              Íbúðareign = c("Fjölbýlishús", "Íbúðareign", "Íbúðarhús"),
                              notimport=c("Herbergi", "Hótelstarfsemi", "Ósamþykkt íbúð","Séreign","Vinnustofa" ))

# Keep residential properties only.
Datasub=subset(Datasub, teg_eign!="notimport")

boxplot(Datasub$fjstof)

boxplot(Datasub$efstah)

# number of the living room should be less than 5.
# Also restrict to: top floor below 8, fewer than 3 kitchens, fewer than
# 3 bathtubs, and municipality codes below 1606 (the capital area).
Datasub=filter(Datasub, Datasub$efstah< 8 &  Datasub$fjeld<3 &  Datasub$fjstof<5 & Datasub$svfn<1606 & Datasub$fjbkar<3)

We replaced numbers with names of capital area and discard real estates outside the capital area. Our data is left with real estate within the capital area.

# Changing these variables into a factor and rename them.
# Map municipality codes to municipality names. The svfn < 1606 filter above
# already removed Kjósarhreppur (code 1606), so recoding it here would only
# trigger an "Unknown levels" warning from fct_recode — it is omitted.
levels(as.factor(Datasub$svfn))
## [1] "0"    "1000" "1100" "1300" "1400" "1604"
Datasub$svfn <- fct_recode(as.factor(Datasub$svfn),
  Reykjavík = "0",
  Kópavogur = "1000",
  Seltjarnarnes = "1100",
  Garðabær = "1300",
  Hafnarfjörður = "1400",
  Mosfellsbær = "1604"
)
levels(Datasub$svfn)
## [1] "Reykjavík"     "Kópavogur"     "Seltjarnarnes" "Garðabær"     
## [5] "Hafnarfjörður" "Mosfellsbær"

We check correlation with correlation plot, ggcorr, on all explanatory variables in our Datasub that we have cleaned against our response, nuvirdi, which is the price of the properties.

library(GGally)
# Correlation heatmap across the numeric variables; non-numeric columns
# (kdagur, teg_eign, svfn) are dropped by ggcorr.
ggcorr(Datasub,label=T, label_size = 2)

# Alternative styling, kept for reference:
#ggcorr(Datasub,label=T, label_size = 3, palette = "RdBu",geom = "circle", nbreaks = 5)

The variable nuvirdi has no relation with stig10 and efstah. But ibm2 has the highest correlation with nuvirdi.

Since data in column(s) kdagur, teg_eign, and svfn are logistic. So, they were ignored. We will look at them in a different plot.

2. Construct descriptive plots.

# Sale price by property type; one extreme point dominates the scale.
ggplot(Datasub) + geom_boxplot(mapping=aes(x=teg_eign, y=nuvirdi)) + 
  labs(x="type of properties", y="nuvirdi = price")

# It is not clear because there is a point outside of range which seems to be error, if it is error we have to remove it and repeat ggplot.

# Inspect the row with the maximum price.
Datasub[which.max(Datasub$nuvirdi),]# This looks like a data-entry error.
##           kdagur nuvirdi   teg_eign        svfn byggar efstah fjmib lyfta  ibm2
## 20411 27.05.2016  956146 Íbúðareign Mosfellsbær   2016      3     1     1 110.5
##       fjhaed fjbilsk fjbkar fjsturt fjklos fjeld fjherb fjstof stig10 bilskurm2
## 20411      1       1      0       1      1     1      2      1     10         0
##       svalm2 geymm2 matssvaedi undirmatssvaedi
## 20411   19.5    4.5        850               0
#therefore, we removed the outlier of our data set.
#Now, the Datasub is more robust.
# Remove the single maximum-price row, then re-plot with a capped y-axis.
Datasub=Datasub[-which.max(Datasub$nuvirdi),]
ggplot(Datasub)+geom_boxplot(mapping=aes(x=teg_eign, y=nuvirdi))+scale_y_continuous( limits=c(0, 200000)) + 
  labs(x="type of properties", y="nuvirdi = price")

comment: The boxplot shows that there is one outlier in the íbúðareign category. We removed it and re-drew the boxplot, which results in a more consistent plot.

Plot to see the relationship between nuvirdi and type of properties.

# Price density per property type, with the x-axis truncated at 100000
# to keep the bulk of the distribution readable.
price_density <- ggplot(Datasub) +
  geom_density(aes(x = nuvirdi, col = teg_eign)) +
  scale_x_continuous(limits = c(0, 100000)) +
  labs(
    x = "nuvirdi = price",
    col = "type of properties",
    title = "Relationship between numbers of properties and price"
  )
print(price_density)

# According to the graph and boxplot, Apartments are cheaper 

comments: According to the graph and boxplot, many apartments are relatively cheaper than all other types of properties. Einbýlishús is the most expensive, while parhús and raðhús are similarly priced. We also note that íbúðareign is the most affordable: its price is relatively low compared to other property types, with a high number of sales.

Let us see the relationship between price and Year.

# Extract the sale year from kdagur as a factor; characters 7-10 hold the
# year, which assumes a dd.mm.yyyy date format — TODO confirm for all rows.
Datasub$year<-as.factor(as.integer(substr(Datasub$kdagur,7,10)))
# Price distribution per year, y-axis capped at 250000 to hide extremes.
ggplot(Datasub)+geom_boxplot(mapping=aes(x=year, y=nuvirdi)) + scale_y_continuous(limits=c(0, 250000)) + 
  labs(x="year", y="nuvirdi = price", title="price of the property with year")

# price has an increasing trend with respect to year.

comments: From boxplot without the outliers, we can see the relationship on the price and year is linear and the price increases with years.

Firstly, we will take a look at the average price of each type of property and how many of them are sold each year in each property type. Secondly, we will make a plot of the table to visualise the relationship between the number of each property type sold and the price of each property type sold.

library(dplyr)
# Group sales by year and property type. Columns are selected by NAME
# (previously positional indices c(2, 3, 24), which silently break if the
# column order ever changes).
Group <- group_by(Datasub[, c("nuvirdi", "teg_eign", "year")], year, teg_eign)
# Number of sales per (year, type) combination.
count(Group) %>% tibble()
## # A tibble: 28 x 3
##    year  teg_eign        n
##    <fct> <fct>       <int>
##  1 2012  Einbýlishús   338
##  2 2012  Íbúðareign   2972
##  3 2012  Parhús         80
##  4 2012  Raðhús        234
##  5 2013  Einbýlishús   384
##  6 2013  Íbúðareign   3252
##  7 2013  Parhús         88
##  8 2013  Raðhús        280
##  9 2014  Einbýlishús   419
## 10 2014  Íbúðareign   3564
## # … with 18 more rows
# Mean sale price per year and property type, drawn as connected points.
dplyr::summarise(Group,avg_price=mean(nuvirdi) ,.groups = 'drop')%>%
ggplot(mapping=aes(x=year, y=avg_price, col=teg_eign, group = teg_eign))+ geom_point() + geom_line()

comments: This graph shows the price increase for each type of property over the years 2012-2018. Since there are few observations in 2018, the price increase from 2017 to 2018 is not consistent compared with 2012-2017; e.g. Seltjarnarnes has no sales recorded in 2018, and Mosfellsbær sales are not consistent with the previous years.

Now, average price of each \(m^2\) area of properties in different area of the city over the years is investigated.

# Price per square metre of living area.
Datasub$pom2 <- Datasub$nuvirdi / Datasub$ibm2
# Group by year, property type and municipality. Columns selected by NAME
# (previously positional indices c(3, 4, 24, 25), fragile to reordering).
Group1 <- group_by(Datasub[, c("teg_eign", "svfn", "year", "pom2")],
                   year, teg_eign, svfn)
count(Group1)
## # A tibble: 166 x 4
## # Groups:   year, teg_eign, svfn [166]
##    year  teg_eign    svfn              n
##    <fct> <fct>       <fct>         <int>
##  1 2012  Einbýlishús Reykjavík       128
##  2 2012  Einbýlishús Kópavogur        61
##  3 2012  Einbýlishús Seltjarnarnes    11
##  4 2012  Einbýlishús Garðabær         53
##  5 2012  Einbýlishús Hafnarfjörður    59
##  6 2012  Einbýlishús Mosfellsbær      26
##  7 2012  Íbúðareign  Reykjavík      1787
##  8 2012  Íbúðareign  Kópavogur       497
##  9 2012  Íbúðareign  Seltjarnarnes    45
## 10 2012  Íbúðareign  Garðabær        153
## # … with 156 more rows
# Mean price per square metre by year and property type, one panel per
# municipality.
dplyr::summarise(Group1,avg_price.omsq=mean(pom2) ,.groups = 'drop')%>%
ggplot(mapping=aes(x=year, y=avg_price.omsq, col=teg_eign, group = teg_eign)) + geom_point() + geom_line() + labs(x="Year", y="Avg of price per m2", fill="City")+facet_wrap(~svfn)

3 Predict sale prices nuvirdi.

Let us split data into test and train data.
We have used Tree, Random Forest, Bagging, Boosting, and Lasso methods.

Tree method

Regression tree analysis of the data and the unpruned tree.
The results from top-down greedy splitting on the training data will be shown.

We use the rpart.plot package, but the tree in this package is already pruned. So, we do not need to use the cv.tree function to run cross-validation, find the best tree size, and then prune the tree.

# Final NA sweep before modelling.
Datasub <- na.omit(Datasub)
library(rpart.plot)
library(tree)
set.seed(1)
# 80/20 train/test split of row indices (seeded for reproducibility).
train <- sample(seq_len(nrow(Datasub)), 0.8 * nrow(Datasub))
# Drop the raw date string and pom2 by NAME (previously positional -c(1, 25)).
# pom2 = nuvirdi/ibm2 is derived from the response and must not be a predictor.
drop_cols <- c("kdagur", "pom2")
trainset <- Datasub[train, !(names(Datasub) %in% drop_cols)]
testset <- Datasub[-train, !(names(Datasub) %in% drop_cols)]

# Fit a regression tree for price on all remaining predictors.
tree.method <- rpart(nuvirdi ~ ., data = trainset)
summary(tree.method)
## Call:
## rpart(formula = nuvirdi ~ ., data = trainset)
##   n= 23004 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.41976577      0 1.0000000 1.0001084 0.020663723
## 2 0.06543543      1 0.5802342 0.5803175 0.014281899
## 3 0.06372539      2 0.5147988 0.5184676 0.011529600
## 4 0.03776862      3 0.4510734 0.4547497 0.011370046
## 5 0.03166658      4 0.4133048 0.4207011 0.011107433
## 6 0.02793308      5 0.3816382 0.3863225 0.011058127
## 7 0.01673886      6 0.3537051 0.3588938 0.010982530
## 8 0.01000000      7 0.3369663 0.3409392 0.009965223
## 
## Variable importance
##      ibm2    fjherb    fjklos  teg_eign bilskurm2    fjhaed      year    byggar 
##        31        15        14        13        10         9         7         1 
## 
## Node number 1: 23004 observations,    complexity param=0.4197658
##   mean=37442.67, MSE=2.903609e+08 
##   left son=2 (16986 obs) right son=3 (6018 obs)
##   Primary splits:
##       ibm2      < 122.85 to the left,  improve=0.4197658, (0 missing)
##       fjklos    < 1.5    to the left,  improve=0.3358032, (0 missing)
##       teg_eign  splits as  RL--RR,     improve=0.3222097, (0 missing)
##       bilskurm2 < 19.35  to the left,  improve=0.2680578, (0 missing)
##       fjherb    < 3.5    to the left,  improve=0.2486505, (0 missing)
##   Surrogate splits:
##       fjklos    < 1.5    to the left,  agree=0.881, adj=0.545, (0 split)
##       teg_eign  splits as  RL--RR,     agree=0.881, adj=0.545, (0 split)
##       fjherb    < 3.5    to the left,  agree=0.871, adj=0.505, (0 split)
##       bilskurm2 < 19.35  to the left,  agree=0.847, adj=0.414, (0 split)
##       fjhaed    < 1.5    to the left,  agree=0.833, adj=0.362, (0 split)
## 
## Node number 2: 16986 observations,    complexity param=0.06372539
##   mean=30871.34, MSE=1.008232e+08 
##   left son=4 (10312 obs) right son=5 (6674 obs)
##   Primary splits:
##       year    splits as  LLLLRRR,    improve=0.24854350, (0 missing)
##       ibm2    < 91.15  to the left,  improve=0.22953540, (0 missing)
##       fjherb  < 1.5    to the left,  improve=0.11826840, (0 missing)
##       byggar  < 2013.5 to the left,  improve=0.10049790, (0 missing)
##       fjbilsk < 0.5    to the left,  improve=0.06540357, (0 missing)
##   Surrogate splits:
##       byggar          < 2015.5 to the left,  agree=0.642, adj=0.088, (0 split)
##       matssvaedi      < 845    to the left,  agree=0.612, adj=0.012, (0 split)
##       stig10          < 7.5    to the right, agree=0.611, adj=0.009, (0 split)
##       undirmatssvaedi < 105.5  to the left,  agree=0.608, adj=0.002, (0 split)
##       svalm2          < 35.05  to the left,  agree=0.608, adj=0.001, (0 split)
## 
## Node number 3: 6018 observations,    complexity param=0.06543543
##   mean=55990.46, MSE=3.594333e+08 
##   left son=6 (4725 obs) right son=7 (1293 obs)
##   Primary splits:
##       ibm2     < 194.75 to the left,  improve=0.20206170, (0 missing)
##       year     splits as  LLLLRRR,    improve=0.17314160, (0 missing)
##       teg_eign splits as  RL--LL,     improve=0.11194130, (0 missing)
##       fjklos   < 2.5    to the left,  improve=0.09220599, (0 missing)
##       fjsturt  < 1.5    to the left,  improve=0.09100026, (0 missing)
##   Surrogate splits:
##       fjherb  < 5.5    to the left,  agree=0.834, adj=0.228, (0 split)
##       fjklos  < 2.5    to the left,  agree=0.831, adj=0.213, (0 split)
##       fjsturt < 2.5    to the left,  agree=0.793, adj=0.036, (0 split)
##       fjeld   < 1.5    to the left,  agree=0.787, adj=0.011, (0 split)
##       fjstof  < 3.5    to the left,  agree=0.787, adj=0.010, (0 split)
## 
## Node number 4: 10312 observations,    complexity param=0.03166658
##   mean=26844.15, MSE=6.38151e+07 
##   left son=8 (6240 obs) right son=9 (4072 obs)
##   Primary splits:
##       ibm2    < 92.95  to the left,  improve=0.3214225, (0 missing)
##       fjherb  < 1.5    to the left,  improve=0.1618782, (0 missing)
##       byggar  < 1997.5 to the left,  improve=0.1220823, (0 missing)
##       year    splits as  LLRR---,    improve=0.1073710, (0 missing)
##       fjbilsk < 0.5    to the left,  improve=0.1024092, (0 missing)
##   Surrogate splits:
##       fjherb    < 2.5    to the left,  agree=0.792, adj=0.473, (0 split)
##       bilskurm2 < 18.05  to the left,  agree=0.645, adj=0.102, (0 split)
##       svalm2    < 8.15   to the left,  agree=0.643, adj=0.096, (0 split)
##       byggar    < 1998.5 to the left,  agree=0.638, adj=0.084, (0 split)
##       fjbilsk   < 0.5    to the left,  agree=0.637, adj=0.080, (0 split)
## 
## Node number 5: 6674 observations,    complexity param=0.02793308
##   mean=37093.77, MSE=9.422687e+07 
##   left son=10 (3535 obs) right son=11 (3139 obs)
##   Primary splits:
##       ibm2   < 88.05  to the left,  improve=0.29668750, (0 missing)
##       fjherb < 1.5    to the left,  improve=0.16527970, (0 missing)
##       byggar < 1997.5 to the left,  improve=0.10731570, (0 missing)
##       year   splits as  ----LRR,    improve=0.09391178, (0 missing)
##       geymm2 < 6.75   to the left,  improve=0.08674808, (0 missing)
##   Surrogate splits:
##       fjherb     < 2.5    to the left,  agree=0.765, adj=0.501, (0 split)
##       byggar     < 1991.5 to the left,  agree=0.623, adj=0.199, (0 split)
##       svalm2     < 6.25   to the left,  agree=0.617, adj=0.187, (0 split)
##       matssvaedi < 325    to the left,  agree=0.607, adj=0.165, (0 split)
##       fjbilsk    < 0.5    to the left,  agree=0.600, adj=0.149, (0 split)
## 
## Node number 6: 4725 observations,    complexity param=0.03776862
##   mean=51532.36, MSE=2.032888e+08 
##   left son=12 (2819 obs) right son=13 (1906 obs)
##   Primary splits:
##       year     splits as  LLLLRRR,    improve=0.26263790, (0 missing)
##       ibm2     < 145.35 to the left,  improve=0.09562307, (0 missing)
##       byggar   < 2015.5 to the left,  improve=0.05396298, (0 missing)
##       teg_eign splits as  RL--RL,     improve=0.04875036, (0 missing)
##       fjklos   < 1.5    to the left,  improve=0.04428063, (0 missing)
##   Surrogate splits:
##       byggar     < 2014.5 to the left,  agree=0.630, adj=0.082, (0 split)
##       lyfta      < 3.5    to the left,  agree=0.601, adj=0.012, (0 split)
##       stig10     < 7.5    to the right, agree=0.599, adj=0.006, (0 split)
##       matssvaedi < 845    to the left,  agree=0.599, adj=0.005, (0 split)
##       geymm2     < 103.05 to the left,  agree=0.598, adj=0.003, (0 split)
## 
## Node number 7: 1293 observations,    complexity param=0.01673886
##   mean=72281.64, MSE=5.920005e+08 
##   left son=14 (1032 obs) right son=15 (261 obs)
##   Primary splits:
##       year       splits as  LLLLLRR,    improve=0.14606520, (0 missing)
##       ibm2       < 300.75 to the left,  improve=0.10242770, (0 missing)
##       teg_eign   splits as  RR--LL,     improve=0.07617583, (0 missing)
##       matssvaedi < 82.5   to the right, improve=0.06742344, (0 missing)
##       svfn       splits as  LLRRLL,     improve=0.06235629, (0 missing)
##   Surrogate splits:
##       byggar < 2016   to the left,  agree=0.801, adj=0.015, (0 split)
##       stig10 < 6.5    to the right, agree=0.800, adj=0.008, (0 split)
##       svalm2 < 147.75 to the left,  agree=0.800, adj=0.008, (0 split)
## 
## Node number 8: 6240 observations
##   mean=23185.57, MSE=3.464252e+07 
## 
## Node number 9: 4072 observations
##   mean=32450.6, MSE=5.657571e+07 
## 
## Node number 10: 3535 observations
##   mean=32111.38, MSE=4.815564e+07 
## 
## Node number 11: 3139 observations
##   mean=42704.72, MSE=8.667155e+07 
## 
## Node number 12: 2819 observations
##   mean=45524.09, MSE=1.242129e+08 
## 
## Node number 13: 1906 observations
##   mean=60418.67, MSE=1.878852e+08 
## 
## Node number 14: 1032 observations
##   mean=67605.21, MSE=4.449732e+08 
## 
## Node number 15: 261 observations
##   mean=90772.37, MSE=7.449724e+08
# Predict on the held-out test set and compute the mean ABSOLUTE error.
# NOTE(review): MeanE is MAE, not MSE, despite later references to "MSE".
tree.pred=predict(tree.method,testset)
MeanE=mean(abs(tree.pred-testset$nuvirdi))
MeanE
## [1] 6875.938
# Draw the (already pruned) rpart tree.
rpart.plot(tree.method, main="prune Tree")

comment:

Random Forests:
Random Forests is better than bagging because it is decorrelating the trees.
Random forests force each split to consider only a subset of the predictors, so other, less dominant predictors are also considered (instead of only the strong predictors). \(m=\sqrt{p}\)

Random Forest

library(randomForest)
set.seed(2)
# Random forest with the regression default mtry = p/3; na.roughfix imputes
# any remaining NAs by column medians/modes.
RandF.method <- randomForest(nuvirdi ~ ., data = trainset,
                             importance = TRUE, na.action = na.roughfix)
RF.pred <- predict(RandF.method, newdata = testset)
# Mean absolute error on the test set.
RFE <- mean(abs(RF.pred - testset$nuvirdi))
RFE
## [1] 3292.927
# Predicted vs actual, with the identity line for reference.
# (Fixed: the original `plot(...) + abline(0, 1)` abused `+` on base-graphics
# return values, which printed a spurious `integer(0)`.)
plot(RF.pred, testset$nuvirdi)
abline(0, 1)
varImpPlot(RandF.method, main = "Importance")

# random forest gets smaller error.

Bagging is built on bootstrapped training samples; at each split a random sample of \(m\) predictors is chosen as split candidates from the full set of \(p\) predictors. Bagging will use the strong predictor in the top split, so the trees will be highly correlated, resulting in high variance. For bagging, \(m = p\).

Using mtry=22 in the Bagging method.
# Bagging method

library(randomForest)
set.seed(2)
# Bagging = random forest with mtry equal to the number of predictors (22
# here), so every split considers all variables.
# (Fixed: variable name typo "Bagbing" -> "Bagging", and the base-graphics
# `plot(...) + abline(...)` misuse that printed `integer(0)`.)
Bagging.method <- randomForest(nuvirdi ~ ., data = trainset, mtry = 22,
                               importance = TRUE, na.action = na.roughfix)
bag.pred <- predict(Bagging.method, newdata = testset)
# Mean absolute error on the test set.
BagE <- mean(abs(bag.pred - testset$nuvirdi))
BagE
## [1] 3353.22
plot(bag.pred, testset$nuvirdi)
abline(0, 1)
varImpPlot(Bagging.method, main = "Importance")

# randomforest gets smaller error.

Boosting :
Boosting works in a similar way, except that the trees are grown sequentially: each tree is grown using information from previously grown trees. Boosting does not involve bootstrap sampling; instead each tree is fit on a modified version of the original data set.

Boosting

library(gbm)
set.seed(3)
# Candidate shrinkage (learning-rate) values for gbm.
Lambda <- c(seq(0.02, 0.1, by = 0.01), seq(0.2, 1, by = 0.1))
# Error at each shrinkage value. Renamed from "train.err": it is computed on
# the TEST set (the plot's ylab already said "Test E").
# NOTE(review): selecting shrinkage by test-set error leaks test information
# into model selection; a validation split or gbm's cv.folds would be cleaner.
test.err <- rep(NA, length(Lambda))
for (i in seq_along(Lambda)) {
  boost.dat <- gbm(nuvirdi ~ ., data = trainset, distribution = "gaussian",
                   n.trees = 1000, interaction.depth = 4,
                   shrinkage = Lambda[i])
  pred.dat <- predict(boost.dat, testset, n.trees = 1000)
  test.err[i] <- mean(abs(pred.dat - testset$nuvirdi))
}
plot(Lambda, test.err, type = "b", xlab = "Shrinkage values", ylab = "Test E")

# Best (smallest) test MAE and the shrinkage that achieved it.
BostE <- min(test.err)
BostE
## [1] 3188.908
Lambda[which.min(test.err)]
## [1] 0.09
# Refit at the selected shrinkage and show variable influence.
boost.best <- gbm(nuvirdi ~ ., data = trainset, distribution = "gaussian",
                  n.trees = 1000, interaction.depth = 4,
                  shrinkage = Lambda[which.min(test.err)])
summary(boost.best)

##                             var     rel.inf
## ibm2                       ibm2 59.33091834
## year                       year 15.46420091
## matssvaedi           matssvaedi  4.73084696
## byggar                   byggar  3.58214582
## teg_eign               teg_eign  2.40855124
## bilskurm2             bilskurm2  2.40102072
## undirmatssvaedi undirmatssvaedi  2.18254825
## fjklos                   fjklos  2.17659796
## svfn                       svfn  1.98720163
## geymm2                   geymm2  1.64341411
## svalm2                   svalm2  1.37691228
## fjsturt                 fjsturt  0.62790348
## fjherb                   fjherb  0.50553421
## fjbilsk                 fjbilsk  0.44522751
## fjhaed                   fjhaed  0.29177715
## efstah                   efstah  0.28961756
## fjstof                   fjstof  0.18987637
## fjbkar                   fjbkar  0.08997921
## fjeld                     fjeld  0.07878164
## stig10                   stig10  0.07522038
## lyfta                     lyfta  0.06383689
## fjmib                     fjmib  0.05788740

Random Forest, Bagging, and Boosting have similar errors, but boosting has the smallest. Boosting is the best model, with the smallest mean absolute prediction error.

Lasso

# Design matrix for glmnet (kdagur and pom2 excluded, as for the trees).
# NOTE(review): model.matrix puts the intercept in column 1; dropping column 2
# removes the FIRST predictor dummy (a teg_eign level), not the intercept.
# The conventional call is model.matrix(...)[, -1] — confirm this was intended.
x=model.matrix(nuvirdi~.,Datasub[,-c(1,25)])[,-2]
y=Datasub$nuvirdi
library(glmnet)
# Lambda grid from 10^10 down to 10^-2.
grid =10^seq (10,-2, length =100)
# alpha = 1 selects the lasso penalty.
lasso.mod =glmnet(x[train ,],y[train],alpha =1, lambda =grid)
plot(lasso.mod)

set.seed(5)
# 10-fold cross-validation on the training rows to pick lambda.
cv.out =cv.glmnet (x[train ,],y[train],alpha =1)
plot(cv.out)

bestlam =cv.out$lambda.min
bestlam 
## [1] 7.040237
# Mean absolute error on the test rows at the CV-optimal lambda.
lasso.pred=predict(lasso.mod ,s=bestlam ,newx=x[-train ,])
LasoE=mean(abs( lasso.pred -y[-train]))

4. Predict the values of a categorical variable

# Collect the test-set mean absolute errors of all five models into one
# table (row labels become the table's first column).
# (Fixed: "Bossting_Error" label typo; T -> TRUE.)
tree_comp <- data.frame(rbind(random_forest = RFE,
           Tree_Error = MeanE,
           Bagging_Error = BagE,
           Boosting_Error = BostE,
           Lasso_Error = LasoE))

colnames(tree_comp) <- "Estimated_prediction_error"
library(kableExtra)
kable(tree_comp) %>% 
  kable_styling(bootstrap_options = "striped", full_width = TRUE)
Estimated_prediction_error
random_forest 3292.927
Tree_Error 6875.938
Bagging_Error 3353.220
Boosting_Error 3188.908
Lasso_Error 4694.276

We have to remove the location because of dependency of svfn and location.
We chose location from svfn to be Kópavogur and Hafnarfjörður. Since, they have intersection of areas around them.

Create dataset and split our data.

set.seed(7)
# Keep only Kópavogur and Hafnarfjörður sales; drop kdagur, matssvaedi,
# undirmatssvaedi, year and pom2 (columns 1 and 22:25) — the valuation-zone
# codes encode location directly and would make predicting svfn trivial.
newsub = filter(Datasub[,-c(1,22:25)], svfn == "Kópavogur" | svfn == "Hafnarfjörður")
# Re-factor to drop the now-empty levels of the other municipalities.
newsub$svfn = factor(newsub$svfn)
newsub = na.omit(newsub)

# 80/20 train/test split (seeded above).
split= sample(1:nrow(newsub), nrow(newsub)*0.8)
trainnew = newsub[split,]
testnew = newsub[-split,]

Random Forest (classification)

library(randomForest)
# Classify municipality from the remaining 19 predictors.
# NOTE(review): mtry = 7 out of 19 predictors is a random forest, not bagging
# (bagging would require mtry equal to the number of predictors) — the
# surrounding text labels this "Bagging"; confirm which was intended.
fit_R.forest = randomForest(svfn ~ ., trainnew, mtry = 7, importance = TRUE )
fit_R.forest
## 
## Call:
##  randomForest(formula = svfn ~ ., data = trainnew, mtry = 7, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 9.53%
## Confusion matrix:
##               Kópavogur Hafnarfjörður class.error
## Kópavogur          3730           252  0.06328478
## Hafnarfjörður       446          2896  0.13345302

Prediction of the random forest model. We use importance to view the importance of each variable and then plot the importance measures with varImpPlot.

# Predict municipality on the held-out 20% and cross-tabulate against truth.
pred_R.forest = predict(fit_R.forest, testnew)
cm_R.forest = table(pred_R.forest, testnew$svfn)
cm_R.forest
##                
## pred_R.forest   Kópavogur Hafnarfjörður
##   Kópavogur           953           119
##   Hafnarfjörður        59           701
# Test-set misclassification rate.
RFEr=mean(pred_R.forest!=testnew$svfn)
RFEr
## [1] 0.09716157
# Per-class permutation importance and Gini decrease for each predictor.
importance(fit_R.forest)
##            Kópavogur Hafnarfjörður MeanDecreaseAccuracy MeanDecreaseGini
## nuvirdi    71.980076    49.1254252            85.439271      438.5973905
## teg_eign   23.827163    28.5428285            36.750388       60.3286631
## byggar    124.946977   114.4920214           151.554557      844.0575297
## efstah     74.167542    68.3771813            89.259483      273.5793721
## fjmib      -1.816563     0.8169102            -1.087707        0.6270664
## lyfta      44.702231    68.8604151            77.365173      189.7682431
## ibm2       84.027618    58.6629606            93.879519      485.3454554
## fjhaed     33.307502    13.2461966            34.844120       48.2897846
## fjbilsk    39.591746    38.8272983            44.642370       78.3642810
## fjbkar     40.195348    37.1493116            44.103188       70.6318239
## fjsturt    37.904481    42.4633996            47.819352       88.8820822
## fjklos     23.194645    10.3885453            26.528252       32.0948492
## fjeld       3.542399     4.7323125             5.824346        5.7596036
## fjherb     42.349898    36.3851018            55.227162      123.4519335
## fjstof     17.404502    28.0386011            28.722796       53.2939965
## stig10      5.830844     4.8402550             7.403010        1.6296011
## bilskurm2  26.653938    55.5028891            60.014659      187.1525508
## svalm2     60.174350    56.7191416            65.865946      333.1113314
## geymm2     83.008966    71.0295989            95.127122      295.0756310
# Dot chart of the two importance measures for each predictor.
varImpPlot(fit_R.forest)

Support Vector Machine (SVM)
using kernel=radial method.

library(e1071)
# Radial-kernel SVM baseline with gamma = 1 and cost = 1 (untuned).
svmfit =svm(svfn~., data=trainnew, kernel ="radial", gamma =1, cost =1)
summary(svmfit)
## 
## Call:
## svm(formula = svfn ~ ., data = trainnew, kernel = "radial", gamma = 1, 
##     cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  4914
## 
##  ( 2333 2581 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  Kópavogur Hafnarfjörður
# 10-fold CV over a grid of cost values.
# NOTE(review): gamma is not in the tuning grid, so tune() uses svm's default
# gamma (1/ncol) rather than the gamma = 1 of the baseline fit above.
tune.out=tune(svm ,svfn~.,data=trainnew ,kernel ="radial", ranges =list(cost=c( 0.01, 0.1, 1,5,10,100) ))
summary(tune.out)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   100
## 
## - best performance: 0.2018024 
## 
## - Detailed performance results:
##    cost     error  dispersion
## 1 1e-02 0.4477076 0.026558975
## 2 1e-01 0.3123911 0.021060426
## 3 1e+00 0.2588757 0.014569369
## 4 5e+00 0.2356634 0.015789139
## 5 1e+01 0.2252870 0.016321703
## 6 1e+02 0.2018024 0.009715444
# Best model from the tuning grid has cost = 100 (see the CV results above;
# the original comment incorrectly said cost = 5).
bestmod =tune.out$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = svfn ~ ., data = trainnew, ranges = list(cost = c(0.01, 
##     0.1, 1, 5, 10, 100)), kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  100 
## 
## Number of Support Vectors:  3558
## 
##  ( 1739 1819 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  Kópavogur Hafnarfjörður
# Predict municipality on the test set with the tuned SVM.
ypred=predict(bestmod ,testnew )
table(predict =ypred , truth= testnew$svfn )
##                truth
## predict         Kópavogur Hafnarfjörður
##   Kópavogur           853           206
##   Hafnarfjörður       159           614
# Test-set misclassification rate.
SVME=mean(ypred!= testnew$svfn)
SVME
## [1] 0.1992358

Bonus

Boosting method to predict 2019-2020 by using data from 2012-2018 (train data).
Check our prediction with the real data on 2019-2020 (test data). We made some adjustments on columns of train data and test data. Since, the information was not the same.

# Keep only apartments (Íbúðareign) to match the 2020 file's sheet
# "Fjölbýli höfuðborg" (capital-area multi-family housing).
Datasubn=Datasub
Datasubn$teg_eign=fct_collapse(Datasubn$teg_eign, 
                              notimport=c( "Einbýlishús","Raðhús","Parhús" ))

# Predictors available in BOTH datasets, plus the response nuvirdi.
C3=c("ibm2","year","matssvaedi","byggar", "bilskurm2","undirmatssvaedi", "geymm2","svalm2","fjsturt","fjbilsk","nuvirdi")
Datasubn=subset(Datasubn, teg_eign!="notimport")
Datasubn=Datasubn[,C3]
library(openxlsx)

dat_2020<- read.xlsx("https://www.skra.is/library/Samnyttar-skrar-/Fyrirtaeki-stofnanir/Fasteignamat-2020/gagnasafn_netid.xlsx", colNames = T, na.strings = "(null)", sheet = "Fjölbýli höfuðborg")

# age_studull holds the property age; convert it back to a build year.
dat_2020$age_studull=2019-dat_2020$age_studull
# Align the 2020 column names with the training data's names.
# NOTE(review): "year" is a factor in Datasubn but numeric in dat_2020 —
# confirm gbm's predict handles this type mismatch as intended.
dat_2020=rename(dat_2020, byggar="age_studull", year="ar", matssvaedi="hverfi", undirmatssvaedi="gata", fjsturt="bath_fixtures")
dat_2020=dat_2020[,C3]
# Comma decimal separators again; convert the area columns to numeric.
dat_2020$ibm2=as.numeric(gsub(",",".",dat_2020$ibm2))
dat_2020$geymm2=as.numeric(gsub(",",".",dat_2020$geymm2))
dat_2020$svalm2=as.numeric(gsub(",",".",dat_2020$svalm2))
dat_2020$bilskurm2=as.numeric(gsub(",",".",dat_2020$bilskurm2))
dat_2020=na.omit(dat_2020)
set.seed(11)
# Fit boosting on all 2012-2018 apartment sales and predict 2019-2020 prices.
boost.Datan=gbm(nuvirdi~.,Datasubn, distribution="gaussian", n.trees =1000 , interaction.depth =4, shrinkage=.1)
pred.boost=predict(boost.Datan, dat_2020, n.trees = 1000)
# Mean absolute error against the actual 2019-2020 values.
mean(abs(pred.boost-dat_2020$nuvirdi))
## [1] 6282.199

The error from our prediction data of 2019-2020 was large. Therefore, we decided to predict only 2018(test data) from 2012-2017 data (train data). We then used this new test data (prediction data of 2018) to predict 2019-2020 data.

# Integer sale year used to split train (2012-2017) vs test (2018).
Datasub$year1=as.integer(substr(Datasub$kdagur,7,10))
# Drop kdagur (col 1) and the factor year (col 24).
# NOTE(review): pom2 = nuvirdi/ibm2 (col 25) is NOT dropped here, so the
# response leaks into the predictors — this likely explains the implausibly
# small 2018 error below (710 vs ~3000 for the earlier models).
Datasubtrain=subset(Datasub, Datasub$year1<2018)[,-c(1,24)]
Datasubtest=subset(Datasub, Datasub$year1==2018)[,-c(1,24)]
# Boost on 2012-2017 and predict 2018.
boost.Datan2018=gbm(nuvirdi~.,Datasubtrain, distribution="gaussian", n.trees =1000 , interaction.depth =4, shrinkage=.1)
pred.boost2018=predict(boost.Datan2018, Datasubtest, n.trees = 1000)
mean(abs(pred.boost2018-Datasubtest$nuvirdi))
## [1] 710.6059
# Replace the actual 2018 prices with our predictions, then retrain on the
# combined 2012-2018 data to predict 2019-2020.
Datasubtest$nuvirdi=pred.boost2018
Datasetj=rbind(Datasubtrain,Datasubtest)
# Keep apartments only, as in the previous bonus section.
Datasetj$teg_eign=fct_collapse(Datasetj$teg_eign, 
                              notimport=c( "Einbýlishús","Raðhús","Parhús" ))


Datasetj=subset(Datasetj, teg_eign!="notimport")
Datasetj=rename(Datasetj,year="year1")
# Restrict to the columns shared with dat_2020 (this drops pom2 again).
Datasetj=Datasetj[,C3]
boost.Datan2019=gbm(nuvirdi~.,Datasetj, distribution="gaussian", n.trees =1000 , interaction.depth =4, shrinkage=.1)
pred.boost2019=predict(boost.Datan2019, dat_2020, n.trees = 1000)
# Mean absolute error of the chained model on the actual 2019-2020 prices.
mean(abs(pred.boost2019-dat_2020$nuvirdi))
## [1] 3237.797